we first need to load the required libraries and read our dataset Load necessary library

library(ggplot2)
library(readr)
library(tidymodels)  
library(splines) 
library(mgcv)
library(caret)
library(earth)
library(pdp)
library(gridExtra)
library(plotly)
library(patchwork)

Next, we need to transform the R datafile into CSV format and read the datafile

load("dat1.RData")
load("dat2.RData")

# Save as CSVs
write.csv(dat1, "dat1.csv", row.names = FALSE)
write.csv(dat2, "dat2.csv", row.names = FALSE)

dat1 = read_csv("dat1.csv",  na = c("NA","","."))
dat2 = read_csv("dat2.csv",na = c("NA","","."))

glimpse(dat1)
## Rows: 5,000
## Columns: 14
## $ id           <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17…
## $ age          <dbl> 50, 71, 58, 63, 56, 59, 67, 62, 60, 64, 61, 67, 60, 60, 5…
## $ gender       <dbl> 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, …
## $ race         <dbl> 1, 1, 1, 1, 1, 3, 4, 1, 4, 1, 1, 3, 4, 1, 1, 3, 3, 2, 3, …
## $ smoking      <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, …
## $ height       <dbl> 176.1, 175.7, 168.7, 167.4, 162.7, 167.8, 175.4, 176.5, 1…
## $ weight       <dbl> 68.3, 69.6, 76.9, 90.0, 83.9, 86.8, 91.4, 87.7, 85.7, 76.…
## $ bmi          <dbl> 22.0, 22.6, 27.0, 32.1, 31.7, 30.8, 29.7, 28.1, 29.0, 31.…
## $ diabetes     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, …
## $ hypertension <dbl> 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, …
## $ SBP          <dbl> 130, 149, 127, 138, 123, 132, 133, 130, 129, 134, 140, 12…
## $ LDL          <dbl> 82, 129, 101, 93, 97, 108, 89, 96, 120, 135, 92, 131, 120…
## $ time         <dbl> 76, 82, 168, 105, 193, 143, 63, 78, 61, 88, 90, 105, 120,…
## $ log_antibody <dbl> 10.647154, 9.889049, 10.900712, 9.906258, 9.563081, 8.837…
glimpse(dat2)
## Rows: 1,000
## Columns: 14
## $ id           <dbl> 5001, 5002, 5003, 5004, 5005, 5006, 5007, 5008, 5009, 501…
## $ age          <dbl> 58, 62, 71, 59, 69, 56, 65, 61, 62, 68, 57, 56, 64, 60, 5…
## $ gender       <dbl> 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, …
## $ race         <dbl> 4, 1, 4, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 4, 3, 1, …
## $ smoking      <dbl> 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ height       <dbl> 176.4, 167.5, 179.3, 170.0, 166.5, 167.6, 175.9, 172.1, 1…
## $ weight       <dbl> 86.4, 82.4, 79.2, 81.0, 74.8, 74.8, 69.2, 81.3, 82.1, 74.…
## $ bmi          <dbl> 27.7, 29.4, 24.6, 28.0, 27.0, 26.6, 22.4, 27.4, 30.7, 26.…
## $ diabetes     <dbl> 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, …
## $ hypertension <dbl> 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, …
## $ SBP          <dbl> 130, 123, 145, 123, 150, 121, 132, 120, 142, 137, 127, 13…
## $ LDL          <dbl> 115, 118, 149, 119, 142, 112, 127, 76, 86, 123, 140, 112,…
## $ time         <dbl> 205, 229, 206, 163, 240, 206, 285, 185, 124, 127, 98, 199…
## $ log_antibody <dbl> 9.810890, 9.076660, 10.432296, 9.831918, 9.074990, 10.182…

Next, we will explore dat1 and perform Exploratory analysis

dat1 %>% 
  select(where(is.numeric)) %>% 
  summary()
##        id            age            gender            race      
##  Min.   :   1   Min.   :44.00   Min.   :0.0000   Min.   :1.000  
##  1st Qu.:1251   1st Qu.:57.00   1st Qu.:0.0000   1st Qu.:1.000  
##  Median :2500   Median :60.00   Median :0.0000   Median :1.000  
##  Mean   :2500   Mean   :59.97   Mean   :0.4854   Mean   :1.749  
##  3rd Qu.:3750   3rd Qu.:63.00   3rd Qu.:1.0000   3rd Qu.:3.000  
##  Max.   :5000   Max.   :75.00   Max.   :1.0000   Max.   :4.000  
##     smoking           height          weight            bmi       
##  Min.   :0.0000   Min.   :150.2   Min.   : 56.70   Min.   :18.20  
##  1st Qu.:0.0000   1st Qu.:166.1   1st Qu.: 75.40   1st Qu.:25.80  
##  Median :0.0000   Median :170.1   Median : 80.10   Median :27.60  
##  Mean   :0.4952   Mean   :170.1   Mean   : 80.11   Mean   :27.74  
##  3rd Qu.:1.0000   3rd Qu.:174.2   3rd Qu.: 84.90   3rd Qu.:29.50  
##  Max.   :2.0000   Max.   :192.9   Max.   :106.00   Max.   :38.80  
##     diabetes       hypertension         SBP             LDL       
##  Min.   :0.0000   Min.   :0.0000   Min.   :101.0   Min.   : 43.0  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:124.0   1st Qu.: 96.0  
##  Median :0.0000   Median :0.0000   Median :130.0   Median :110.0  
##  Mean   :0.1544   Mean   :0.4596   Mean   :129.9   Mean   :109.9  
##  3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:135.0   3rd Qu.:124.0  
##  Max.   :1.0000   Max.   :1.0000   Max.   :155.0   Max.   :185.0  
##       time        log_antibody   
##  Min.   : 30.0   Min.   : 7.765  
##  1st Qu.: 76.0   1st Qu.: 9.682  
##  Median :106.0   Median :10.089  
##  Mean   :108.9   Mean   :10.064  
##  3rd Qu.:138.0   3rd Qu.:10.478  
##  Max.   :270.0   Max.   :11.961
# Count and proportion tables for categorical variables
dat1 %>% 
  select(gender, race, smoking, diabetes, hypertension) %>% 
  map(~table(.))  
## $gender
## .
##    0    1 
## 2573 2427 
## 
## $race
## .
##    1    2    3    4 
## 3221  278 1036  465 
## 
## $smoking
## .
##    0    1    2 
## 3010 1504  486 
## 
## $diabetes
## .
##    0    1 
## 4228  772 
## 
## $hypertension
## .
##    0    1 
## 2702 2298
dat1 %>% 
  select(gender, race, smoking, diabetes, hypertension) %>% 
  map(~prop.table(table(.)))  
## $gender
## .
##      0      1 
## 0.5146 0.4854 
## 
## $race
## .
##      1      2      3      4 
## 0.6442 0.0556 0.2072 0.0930 
## 
## $smoking
## .
##      0      1      2 
## 0.6020 0.3008 0.0972 
## 
## $diabetes
## .
##      0      1 
## 0.8456 0.1544 
## 
## $hypertension
## .
##      0      1 
## 0.5404 0.4596

Exploratory Visualizations:

  1. Visualize the distribution of log_antibody
ggplot(dat1, aes(x = log_antibody)) +
  geom_histogram(bins = 30, fill = "skyblue", color = "white") +
  labs(title = "Distribution of Antibody Levels", x = "log_antibody", y = "Count") +
  theme_minimal()

  1. Scatterplot: log_antibody vs. time: See how antibody levels change over time post-vaccination.
plot_ly(
  data = dat1,
  x = ~time,
  y = ~log_antibody,
  type = "scatter",
  mode = "markers", marker = list(opacity = 0.3)
) %>%
  add_lines(
    x = ~time,
    y = ~fitted(loess(log_antibody ~ time, data = dat1)),
    line = list(color = "red"),
    name = "LOESS Smooth"
  ) %>%
  layout(
    title = "Antibody Levels Over Time",
    xaxis = list(title = "Time Since Vaccination (days)"),
    yaxis = list(title = "log_antibody")
  )
  1. Boxplots by categorical variables: To compare antibody levels between different subgroups

Gender

p_gender = ggplot(dat1, aes(x = factor(gender), y = log_antibody)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Antibody Levels by Gender", x = "Gender (0 = Female, 1 = Male)", y = "log_antibody") +
  theme_minimal()

p_gender

Smoking status

p_smoking = ggplot(dat1, aes(x = factor(smoking), y = log_antibody)) +
  geom_boxplot(fill = "plum") +
  labs(title = "Antibody Levels by Smoking Status", x = "Smoking (0=Never, 1=Former, 2=Current)", y = "log_antibody") +
  theme_minimal()

p_smoking

* Never and former smokers have similar distributions, with slightly higher medians. * Current smokers tend to have slightly lower antibody levels. * Smoking status may have a modest effect on immune response.

Diabetes

p_diabetes = ggplot(dat1, aes(x = factor(diabetes), y = log_antibody)) +
  geom_boxplot(fill = "salmon") +
  labs(title = "Antibody Levels by Diabetes Status", x = "Diabetes (0 = No, 1 = Yes)", y = "log_antibody") +
  theme_minimal()

p_diabetes

Race

p_race = ggplot(dat1, aes(x = factor(race), y = log_antibody)) +
  geom_boxplot(fill = "khaki") +
  labs(
    title = "Antibody Levels by Race",
    x = "Race (1 = White, 2 = Asian, 3 = Black, 4 = Hispanic)",
    y = "log_antibody"
  ) +
  theme_minimal()
p_race

* No substantial differences across race groups. * Medians are nearly identical. * Interpretation: * Race may not be a strong predictor of antibody levels here.

Hypertension status

p_htn = ggplot(dat1, aes(x = factor(hypertension), y = log_antibody)) +
  geom_boxplot(fill = "lightgreen") +
  labs(
    title = "Antibody Levels by Hypertension Status",
    x = "Hypertension (0 = No, 1 = Yes)",
    y = "log_antibody"
  ) +
  theme_minimal()
p_htn

We will creat a panel including all our categrical predictors analysis:

(p_gender | p_race | p_smoking) /
(p_diabetes | p_htn) 

  1. Scatterplots with continuous predictors

BMI

p_bmi = ggplot(dat1, aes(x = bmi, y = log_antibody)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "loess", se = FALSE, color = "darkgreen") +
  labs(title = "Antibody levels by BMI", x = "BMI", y = "log_antibody") +
  theme_minimal()

p_bmi

* Antibody levels are relatively flat until ~25–28 BMI, then slightly decline. * Weak, nonlinear negative association at higher BMI.

Antibody vs. Age

p_age = ggplot(dat1, aes(x = age, y = log_antibody)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "loess", se = FALSE, color = "blue") +
  labs(title = "Antibody Levels by Age", x = "Age (years)", y = "log_antibody") +
  theme_minimal()
p_age

* Slight decline in antibody levels as age increases. * Smooth trend suggests a gradual, nonlinear effect.

Antibody vs. Systolic Blood Pressure (SBP)

p_sbp = ggplot(dat1, aes(x = SBP, y = log_antibody)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "loess", se = FALSE, color = "darkred") +
  labs(title = "Antibody Levels by Systolic Blood Pressure", x = "SBP (mmHg)", y = "log_antibody") +
  theme_minimal()

p_sbp

* No clear trend — curve is almost flat across the SBP range. * SBP appears to have minimal or no relationship with antibody levels.

Antibody vs. LDL Cholesterol

p_ldl = ggplot(dat1, aes(x = LDL, y = log_antibody)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "loess", se = FALSE, color = "purple") +
  labs(title = "Antibody Levels by LDL Cholesterol", x = "LDL (mg/dL)", y = "log_antibody") +
  theme_minimal()

p_ldl

(p_bmi | p_age) /
(p_sbp | p_ldl)

Next step, we choose the GAM model training for option 1

mod_gam_full = gam(
  log_antibody ~ s(time) + s(age) + s(bmi) +
    LDL + SBP +
    gender + race + smoking + diabetes + hypertension,
  data = dat1
)

summary(mod_gam_full)      
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log_antibody ~ s(time) + s(age) + s(bmi) + LDL + SBP + gender + 
##     race + smoking + diabetes + hypertension
## 
## Parametric coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  10.1728744  0.2041632  49.827  < 2e-16 ***
## LDL          -0.0002465  0.0003873  -0.636    0.525    
## SBP           0.0008631  0.0016387   0.527    0.598    
## gender       -0.2957105  0.0149976 -19.717  < 2e-16 ***
## race         -0.0097455  0.0069589  -1.400    0.161    
## smoking      -0.0561992  0.0112546  -4.993 6.13e-07 ***
## diabetes      0.0154195  0.0207306   0.744    0.457    
## hypertension -0.0163727  0.0250865  -0.653    0.514    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##           edf Ref.df      F p-value    
## s(time) 7.862  8.561  46.75  <2e-16 ***
## s(age)  1.000  1.000 114.18  <2e-16 ***
## s(bmi)  5.127  6.197  68.43  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.214   Deviance explained = 21.7%
## GCV = 0.2809  Scale est. = 0.27966   n = 5000
plot(mod_gam_full, pages = 1)

All three smooth terms are highly significant (p < 2e-16)

Time, age, and BMI have nonlinear relationships with log_antibody

Their estimated degrees of freedom (edf):

Interpretation:Smooth terms for time, age, and BMI were statistically significant, indicating non-linear effects on antibody levels. The model automatically adapted flexibility for each predictor based on the data.

Model performance of the traning set

# Predict log_antibody in the test set
pred_gam = predict(mod_gam_full, newdata = dat2)

# True observed values
true_values = dat2$log_antibody


# Create a tibble with true and predicted values
eval_df = tibble(
  truth = dat2$log_antibody,
  prediction = pred_gam
)

# Get multiple metrics at once
metrics(eval_df, truth = truth, estimate = prediction)
## # A tibble: 3 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard       0.539
## 2 rsq     standard       0.171
## 3 mae     standard       0.433

Using the GAM model on this external dataset, the model achieved a root mean squared error (RMSE) of 0.539 and a mean absolute error (MAE) of 0.433, indicating modest prediction error. The model explained approximately 17.1% of the variation in log-transformed antibody levels (R² = 0.171). While performance was not exceptionally high, this is consistent with the inherent complexity and noise in biological data. The relatively stable performance across the two datasets suggests that the model generalizes reasonably well to new individuals.